SE object created from exploratory analysis.
Gene selection according to Biotype already performed
SE_Bio <- readRDS(paste0(params$SE_Bio))Duplicated gene names are dropped and gene IDs are set as rownames.
The number of duplicated GeneName is: 11899
The number of duplicated Ensembl Genes with version is: 0
SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment
## dim: 24708 36
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')
padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_thresholdOnly day25 Cortical Brain Organoids are kept
SE_Bio_d25CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd25']
colData(SE_Bio_d25CbO)
## DataFrame with 13 rows and 11 columns
## SampleNumber SampleID_GU SampleID_OG SampleName SampleType Genotype
## <integer> <character> <character> <character> <character> <character>
## OLE_007 5 D25_2_WT D25_2_WT ORG_CHD2_WT_round2A_.. CorticalOrganoids WT
## OLE_008 6 D25_2_HT D25_2_HT ORG_CHD2_HT_round2A_.. CorticalOrganoids HT
## OLE_013 9 D25_3b_WT D25_3b_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids WT
## OLE_014 10 D25_3b_HT D25_3b_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids HT
## OLE_031 18 OLE_031 OL06 6_Evo1_d25_WT3 CorticalOrganoids WT
## ... ... ... ... ... ... ...
## OLE_035 22 OLE_035 OL10 10_Evo3_d25_WTC_1-2 CorticalOrganoids WT
## OLE_036 23 OLE_036 OL11 11_Evo3_d25_A5_1-2 CorticalOrganoids AR
## OLE_037 24 OLE_037 OL12 12_Evo4_d25_WTC-A CorticalOrganoids WT
## OLE_038 25 OLE_038 OL13 13_Evo4_d25_A5-A CorticalOrganoids AR
## OLE_039 26 OLE_039 OL14 14_Evo4_d25_HET-A CorticalOrganoids HT
## Timepoint Batch SeqRun HRID lib.size
## <character> <character> <integer> <character> <numeric>
## OLE_007 d25 Round2 1 OLE_007 17223455
## OLE_008 d25 Round2 1 OLE_008 18879578
## OLE_013 d25 Round3 1 OLE_013 20419528
## OLE_014 d25 Round3 1 OLE_014 41252167
## OLE_031 d25 Evo1 2 OLE_031 24061399
## ... ... ... ... ... ...
## OLE_035 d25 Evo3 2 OLE_035 18643755
## OLE_036 d25 Evo3 2 OLE_036 21633783
## OLE_037 d25 Evo4 2 OLE_037 22432181
## OLE_038 d25 Evo4 2 OLE_038 20928321
## OLE_039 d25 Evo4 2 OLE_039 19103731CountMatrix <- assays(SE_Bio_d25CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d25CbO))
all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUEdds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d25CbO)$counts, DataFrame(colData(SE_Bio_d25CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors
mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d25CbO)))
dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")
dds$Genotype
## [1] WT HT WT HT WT AR HT HT WT AR WT AR HT
## Levels: WT AR HT
dds
## class: DESeqDataSet
## dim: 24708 13
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(13): OLE_007 OLE_008 ... OLE_038 OLE_039
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeZeroPlot <- CountMatrix %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = "col", values_to = "value") %>%
filter(value == 0) %>%
group_by(col) %>%
summarise(zerocount = n()) %>%
ggplot(., aes(y=col, x=zerocount)) +
geom_col(col='black', fill='#76c8c8') +
coord_flip() +
geom_label(aes(label=zerocount)) +
labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
y='', x='') +
theme_bw() +
theme(axis.text = element_text(colour = 'black', size=7),
axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
plot.title = element_text(hjust = 0.5, size = 7))
ZeroPlotkeep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3
table(keep)
## keep
## FALSE TRUE
## 6927 17781
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 17781 13
## metadata(1): version
## assays(1): counts
## rownames(17781): TSPAN6 TNMD ... PDCD6-AHRR LNCDAT
## rowData names(9): Gene EnsGene ... Start End
## colnames(13): OLE_007 OLE_008 ... OLE_038 OLE_039
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeA dds object containing information about 17781 genes in 13 samples is tested for differential expression.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
## OLE_007 OLE_008 OLE_013 OLE_014 OLE_031 OLE_032 OLE_033 OLE_034 OLE_035
## 0.7693623 0.8724444 0.9194546 1.8788253 1.1027455 1.0629631 0.9634343 1.0544527 0.8574659
## OLE_036 OLE_037 OLE_038 OLE_039
## 0.9944899 1.0366067 0.9750425 0.8666412select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))
rownames(df) <- colnames(ntd)
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=TRUE, annotation_col=df, border_color = 'black')vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors, main = 'Samples Distances', name = 'vst')SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))
par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
plotDensity(log2(counts(dds)), col=ScaledCols,
xlab="log2(counts)", cex.lab=0.7, panel.first=grid())
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols,
xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) plotDispEsts(dds)res_dds_ht <- results(dds, contrast=c("Genotype","HT","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ht)
##
## out of 17781 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1535, 8.6%
## LFC < 0 (down) : 1118, 6.3%
## outliers [1] : 0, 0%
## low counts [2] : 1035, 5.8%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
metadata(res_dds_ht)$filterThreshold
## 5.816327%
## 4.045919
metadata(res_dds_ht)$alpha
## [1] 0.05
plot(metadata(res_dds_ht)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter", main='Heterozygous',
cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ht)$lo.fit, col="red")
abline(v=metadata(res_dds_ht)$filterTheta)head(res_dds_ht[order(res_dds_ht$padj),])
## log2 fold change (MLE): Genotype HT vs WT
## Wald test p-value: Genotype HT vs WT
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ERV3-1 1392.902 -8.69184 0.349839 -24.8453 2.90981e-136 4.87276e-132
## YPEL3-DT 284.791 4.28892 0.196320 21.8466 8.37509e-106 7.01246e-102
## PCDHA6 249.677 4.67330 0.218532 21.3849 1.84625e-101 1.03058e-97
## ZNF562 345.202 -3.91232 0.183368 -21.3358 5.27814e-101 2.20969e-97
## ZNF273 298.478 -6.44257 0.337653 -19.0805 3.66973e-81 1.22907e-77
## ZNF667-AS1 234.548 -6.43540 0.339162 -18.9744 2.77518e-80 7.74552e-77Genes modulated considering a FDR threshold of 0.1: 3346
Genes modulated considering a FDR threshold of 0.05: 2653
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 1511
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 916
Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.
resAshr_ht <- lfcShrink(dds, contrast=c("Genotype","HT","WT"), res=res_dds_ht, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ht)
##
## out of 17781 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1535, 8.6%
## LFC < 0 (down) : 1118, 6.3%
## outliers [1] : 0, 0%
## low counts [2] : 1035, 5.8%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ht, xlim=xlim, ylim=ylim, main="no LFC shrink")DESeq2::plotMA(resAshr_ht, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")head(resAshr_ht[order(resAshr_ht$padj),])
## log2 fold change (MMSE): Genotype HT vs WT
## Wald test p-value: Genotype HT vs WT
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ERV3-1 1392.902 -8.64521 0.349448 2.90981e-136 4.87276e-132
## YPEL3-DT 284.791 4.26905 0.196690 8.37509e-106 7.01246e-102
## PCDHA6 249.677 4.65034 0.218992 1.84625e-101 1.03058e-97
## ZNF562 345.202 -3.89358 0.183785 5.27814e-101 2.20969e-97
## ZNF273 298.478 -6.40030 0.338176 3.66973e-81 1.22907e-77
## ZNF667-AS1 234.548 -6.39273 0.339699 2.77518e-80 7.74552e-77Genes modulated considering a FDR threshold of 0.1: 3346
Genes modulated considering a FDR threshold of 0.05: 2653
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 1237
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 676
Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.
deseqDEGsHT <- dplyr::filter(data.frame(res_dds_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))
deseqDEGsHTashr <- dplyr::filter(data.frame(resAshr_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))FdrCeil = 1e-10
logFcCeil = 7.5#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')
dplyr::rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (day25 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')
dplyr::rename(as.data.frame(resAshr_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (day25 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)DEAList <- list()
DEAList <- list(dds = dds, #same for both
HT = list(res = res_dds_ht,
DEGs = deseqDEGsHT,
resAshr =resAshr_ht,
DEGsAshr = deseqDEGsHTashr))# RDS
saveRDS(DEAList, paste0(SavingFolder, '/day25CbO.', 'DEAList_HT.rds'))
saveRDS(SE_deseq2, paste0(SavingFolder, '/day25CbO.', 'SE_deseq2_HT.rds'))
saveRDS(res_dds_ht, paste0(SavingFolder, '/day25CbO.', 'deseqHTvsWT.rds'))
SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/day25CbO.', 'DEAnalysisWorkspace_HT.RData'))